#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#define ARRAY_SIZE  65535
//一个中文长度
#define char_len 2 
//字典中长词的长度
#define MAX_KEYWARD_SIZE (12*char_len)
unsigned int JSHash(char* str)
{
	unsigned int hash = 1315423911;

	while (*str)
	{
		hash ^= ((hash << 5) + (*str++) + (hash >> 2));
	}

	return (hash & 0x7FFFFFFF);
}

//链表部分

typedef struct node_
{
	char* value;
	struct node_* pnext;
}node;


typedef struct {
	struct node_* head;
	unsigned size;
}list;

//头插链表
void push_front(list* n, char* val)
{
	node* newnode;
	newnode = malloc(sizeof(node));
	newnode->pnext = n->head;
	newnode->value = val;
	n->head = newnode;
	n->size++;
}
//清空链表
//参数2 是否是否数据
void clear_list(list* l, int is_free_data)
{
	node* cur, * temp;
	cur = l->head;
	while (cur)
	{
		if (is_free_data)
			free(cur->value);
		temp = cur;
		cur = cur->pnext;
		free(temp);
	}
	l->head = 0;
}


//简易hash set


typedef struct
{
	list m_arr[ARRAY_SIZE];
}hash_set;

hash_set* Set()
{
	hash_set* set;
	set = malloc(sizeof(hash_set));
	memset(set, 0, sizeof(hash_set));
	return set;
}
void insert(hash_set* set, char* val)
{
	unsigned hash;
	hash = JSHash(val) % ARRAY_SIZE;
	push_front(&set->m_arr[hash], val);
}

char* has(hash_set* set, char* val)
{
	unsigned hash;
	node* cur;
	hash = JSHash(val) % ARRAY_SIZE;
	cur = set->m_arr[hash].head;
	while (cur)
	{
		if (strcmp(cur->value, val) == 0)
		{
			return cur->value;
		}
		cur = cur->pnext;
	}
	return 0;

}



void clear_set(hash_set* set)
{
	for (size_t i = 0; i < ARRAY_SIZE; i++)
	{
		clear_list(&set->m_arr[i], 1);
	}
}


// 业务

#define _OUT_
void read_data(hash_set* set, const char* file);
list forward_segment(hash_set* set, char* val);
list backward_segment(hash_set* set, char* val);
list bidirectional_segment(hash_set* set, char* val, _OUT_ int* desc);
void print_list(list l);
void print_list_reverse(list l);

int main()
{
	hash_set* set;
	list fs;
	int is_desc;

	char source[4096];

	set = Set();
	read_data(set, "CoreNatureDictionary.mini.txt");


	printf("输入需要分词的中文文本:");
	scanf("%s", source);

	fs = bidirectional_segment(set, source, &is_desc);

	if (is_desc)
	{
		print_list_reverse(fs);
	}
	else
	{
		print_list(fs);
	}

	clear_list(&fs, 0);
	clear_set(set);
	free(set);
}

void read_data(hash_set* set, const char* file)
{
	FILE* pf;
	char str[128];
	char* val;
	int len;
	assert(pf = fopen(file, "r"));

	while (!feof(pf))
	{
		fscanf(pf, "%s", str);
		len = strlen(str);
		val = malloc(len + 1);
		strcpy(val, str);
		insert(set, val);
		fgets(str, 128, pf);
	}
}

list forward_segment(hash_set* set, char* val)
{
	list l;
	int i, j;
	size_t end, len;
	char cur[13 * char_len], * finded;
	l.head = 0;
	l.size = 0;
	len = strlen(val);
	for (i = 0; i < len; i += char_len)
	{
		end = i + MAX_KEYWARD_SIZE;
		if (end > len)
		{
			end = len;
		}
		for (j = end; j >= i; j -= char_len)
		{
			memset(cur, 0, 13 * char_len);
			strncpy(cur, val + i, j - i);
			if (finded = has(set, cur))
			{
				push_front(&l, finded);
				i += strlen(finded) - char_len;
				break;
			}
		}
	}

	return l;
}

list backward_segment(hash_set* set, char* val)
{
	list l;
	size_t len;
	int i, j, end;
	char cur[13 * char_len];
	char* finded;
	len = strlen(val);
	l.head = 0;
	l.size = 0;
	for (i = len; i >= 0; i -= char_len)
	{
		end = i - MAX_KEYWARD_SIZE;
		if (end < 0)end = 0;
		for (j = end; j < i; j += char_len)
		{
			memset(cur, 0, 13 * char_len);
			strncpy(cur, val + j, i - j);
			if (finded = has(set, cur))
			{
				push_front(&l, finded);
				i = j + char_len;
				break;
			}
		}
	}
	return l;
}


void do_reverse(node* n)
{
	if (!n) return;
	do_reverse(n->pnext);
	printf("%s ", n->value);
}

void print_list_reverse(list l)
{
	do_reverse(l.head);
	putchar(10);
}
size_t count_single_char(list l)
{
	size_t count;
	node* cur;
	count = 0;
	cur = l.head;
	while (cur)
	{
		if (strlen(cur->value) == char_len)
		{
			count++;
		}
		cur = cur->pnext;
	}
	return count;
}

list bidirectional_segment(hash_set* set, char* val, _OUT_ int* desc)
{
	list fs, bs;
	fs = forward_segment(set, val);
	bs = backward_segment(set, val);
	//分词数量少优先
	if (bs.size > fs.size)
	{
		clear_list(&bs, 0);
		*desc = 1;
		return fs;
	}
	if (bs.size < fs.size)
	{
		clear_list(&fs, 0);
		*desc = 0;
		return bs;
	}
	//单字词少 次先
	if (count_single_char(fs) > count_single_char(bs))
	{
		clear_list(&fs, 0);
		*desc = 0;
		return bs;
	}
	*desc = 1;
	clear_list(&bs, 0);
	return fs;
}

void print_list(list l)
{
	node* cur;
	cur = l.head;
	while (cur)
	{
		printf("%s ", cur->value);
		cur = cur->pnext;
	}
	printf("\n");
}